Loading the required packages.
Loading both the training and the testing data sets.
# Load the training and testing data.  Empty strings, "NA" and "#DIV/0!"
# (Excel division errors present in this data set) are all read as NA.
dataFrameTrainLoad <- read.csv("pml-training.csv", stringsAsFactors = FALSE,
                               na.strings = c("", "NA", "#DIV/0!"))
dataFrameTestLoad <- read.csv("pml-testing.csv", stringsAsFactors = FALSE,
                              na.strings = c("", "NA", "#DIV/0!"))
dim(dataFrameTrainLoad); dim(dataFrameTestLoad)
## [1] 19622 160
## [1] 20 160
# Split the loaded training data into an 80% training set and a 20%
# validation set, stratified on the target `classe` (caret).
# NOTE(review): there is no set.seed() before the partition, so the split
# is not reproducible across runs — consider adding one.
informationOfTraining <- createDataPartition(dataFrameTrainLoad$classe,
                                             p = 0.8, list = FALSE)
valueOfDifferent <- dataFrameTrainLoad[-informationOfTraining, ]
dataFrameTrainLoad <- dataFrameTrainLoad[informationOfTraining, ]
dim(dataFrameTrainLoad); dim(valueOfDifferent)
## [1] 15699 160
## [1] 3923 160
# Proportion of each activity class in the training split (checks that
# the stratified split kept the class balance).
table(dataFrameTrainLoad$classe)/nrow(dataFrameTrainLoad)
##
## A B C D E
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
# Per-column NA counts for the belt-sensor columns.
# colSums(is.na(...)) replaces sapply(select(...), function(x) sum(is.na(x)))
# — it returns the same named per-column counts, fully vectorised.
# (Counts come back as doubles rather than integers; downstream code only
# uses `!= 0` and names(), so this is compatible.)
missingBeltVariable <- colSums(is.na(
  dataFrameTrainLoad[grepl("_belt", names(dataFrameTrainLoad))]
))
missingBeltVariable
## roll_belt pitch_belt yaw_belt
## 0 0 0
## total_accel_belt kurtosis_roll_belt kurtosis_picth_belt
## 0 15382 15400
## kurtosis_yaw_belt skewness_roll_belt skewness_roll_belt.1
## 15699 15381 15400
## skewness_yaw_belt max_roll_belt max_picth_belt
## 15699 15374 15374
## max_yaw_belt min_roll_belt min_pitch_belt
## 15382 15374 15374
## min_yaw_belt amplitude_roll_belt amplitude_pitch_belt
## 15382 15374 15374
## amplitude_yaw_belt var_total_accel_belt avg_roll_belt
## 15382 15374 15374
## stddev_roll_belt var_roll_belt avg_pitch_belt
## 15374 15374 15374
## stddev_pitch_belt var_pitch_belt avg_yaw_belt
## 15374 15374 15374
## stddev_yaw_belt var_yaw_belt gyros_belt_x
## 15374 15374 0
## gyros_belt_y gyros_belt_z accel_belt_x
## 0 0 0
## accel_belt_y accel_belt_z magnet_belt_x
## 0 0 0
## magnet_belt_y magnet_belt_z
## 0 0
# Per-column NA counts for the arm-sensor columns (note: "_arm" does not
# match the forearm columns, which use the "_forearm" suffix).
# colSums(is.na(...)) replaces the original sapply/anonymous-function form.
missingArmVariable <- colSums(is.na(
  dataFrameTrainLoad[grepl("_arm", names(dataFrameTrainLoad))]
))
missingArmVariable
## roll_arm pitch_arm yaw_arm total_accel_arm
## 0 0 0 0
## var_accel_arm avg_roll_arm stddev_roll_arm var_roll_arm
## 15374 15374 15374 15374
## avg_pitch_arm stddev_pitch_arm var_pitch_arm avg_yaw_arm
## 15374 15374 15374 15374
## stddev_yaw_arm var_yaw_arm gyros_arm_x gyros_arm_y
## 15374 15374 0 0
## gyros_arm_z accel_arm_x accel_arm_y accel_arm_z
## 0 0 0 0
## magnet_arm_x magnet_arm_y magnet_arm_z kurtosis_roll_arm
## 0 0 0 15435
## kurtosis_picth_arm kurtosis_yaw_arm skewness_roll_arm skewness_pitch_arm
## 15437 15384 15434 15437
## skewness_yaw_arm max_roll_arm max_picth_arm max_yaw_arm
## 15384 15374 15374 15374
## min_roll_arm min_pitch_arm min_yaw_arm amplitude_roll_arm
## 15374 15374 15374 15374
## amplitude_pitch_arm amplitude_yaw_arm
## 15374 15374
# Per-column NA counts for the forearm-sensor columns.
# colSums(is.na(...)) replaces the original sapply/anonymous-function form.
missingforearmVariable <- colSums(is.na(
  dataFrameTrainLoad[grepl("_forearm", names(dataFrameTrainLoad))]
))
missingforearmVariable
## roll_forearm pitch_forearm yaw_forearm
## 0 0 0
## kurtosis_roll_forearm kurtosis_picth_forearm kurtosis_yaw_forearm
## 15443 15444 15699
## skewness_roll_forearm skewness_pitch_forearm skewness_yaw_forearm
## 15442 15444 15699
## max_roll_forearm max_picth_forearm max_yaw_forearm
## 15374 15374 15443
## min_roll_forearm min_pitch_forearm min_yaw_forearm
## 15374 15374 15443
## amplitude_roll_forearm amplitude_pitch_forearm amplitude_yaw_forearm
## 15374 15374 15443
## total_accel_forearm var_accel_forearm avg_roll_forearm
## 0 15374 15374
## stddev_roll_forearm var_roll_forearm avg_pitch_forearm
## 15374 15374 15374
## stddev_pitch_forearm var_pitch_forearm avg_yaw_forearm
## 15374 15374 15374
## stddev_yaw_forearm var_yaw_forearm gyros_forearm_x
## 15374 15374 0
## gyros_forearm_y gyros_forearm_z accel_forearm_x
## 0 0 0
## accel_forearm_y accel_forearm_z magnet_forearm_x
## 0 0 0
## magnet_forearm_y magnet_forearm_z
## 0 0
# Per-column NA counts for the dumbbell-sensor columns.
# colSums(is.na(...)) replaces the original sapply/anonymous-function form.
missingdumbbellVariable <- colSums(is.na(
  dataFrameTrainLoad[grepl("_dumbbell", names(dataFrameTrainLoad))]
))
missingdumbbellVariable
## roll_dumbbell pitch_dumbbell yaw_dumbbell
## 0 0 0
## kurtosis_roll_dumbbell kurtosis_picth_dumbbell kurtosis_yaw_dumbbell
## 15379 15376 15699
## skewness_roll_dumbbell skewness_pitch_dumbbell skewness_yaw_dumbbell
## 15378 15375 15699
## max_roll_dumbbell max_picth_dumbbell max_yaw_dumbbell
## 15374 15374 15379
## min_roll_dumbbell min_pitch_dumbbell min_yaw_dumbbell
## 15374 15374 15379
## amplitude_roll_dumbbell amplitude_pitch_dumbbell amplitude_yaw_dumbbell
## 15374 15374 15379
## total_accel_dumbbell var_accel_dumbbell avg_roll_dumbbell
## 0 15374 15374
## stddev_roll_dumbbell var_roll_dumbbell avg_pitch_dumbbell
## 15374 15374 15374
## stddev_pitch_dumbbell var_pitch_dumbbell avg_yaw_dumbbell
## 15374 15374 15374
## stddev_yaw_dumbbell var_yaw_dumbbell gyros_dumbbell_x
## 15374 15374 0
## gyros_dumbbell_y gyros_dumbbell_z accel_dumbbell_x
## 0 0 0
## accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x
## 0 0 0
## magnet_dumbbell_y magnet_dumbbell_z
## 0 0
# Gather the names of every column, across the four sensor groups, that
# has at least one missing value; these summary columns will be dropped.
dropColumn2Variable <- unlist(lapply(
  list(missingBeltVariable, missingArmVariable,
       missingforearmVariable, missingdumbbellVariable),
  function(counts) names(counts)[counts != 0]
), use.names = FALSE)
length(dropColumn2Variable)
## [1] 100
# Keep only raw sensor signals: drop the high-NA summary columns plus the
# bookkeeping columns (row id, user, timestamps, window markers).
# as_tibble() replaces the deprecated tbl_df(), and all_of() makes the use
# of the external character vector explicit — both fixes are exactly what
# the printed warning/note recommend.
differenceAnalizeVariable <- as_tibble(dataFrameTrainLoad %>%
  select(-all_of(dropColumn2Variable),
         -c(X, user_name, raw_timestamp_part_1,
            raw_timestamp_part_2, cvtd_timestamp,
            new_window, num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(dropColumn2Variable)` instead of `dropColumn2Variable` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# `classe` is the prediction target; make it a factor for modelling.
differenceAnalizeVariable$classe <- as.factor(differenceAnalizeVariable$classe)
# Coerce the 52 remaining feature columns to numeric (lapply keeps the
# tibble's column structure intact).
differenceAnalizeVariable[,1:52] <- lapply(differenceAnalizeVariable[,1:52],as.numeric)
dim(differenceAnalizeVariable)
## [1] 15699 53
# Identify features that are strongly correlated (|r| > 0.8) with at
# least one other feature, then visualise their correlation structure.
correlation_columnVariable <- cor(select(differenceAnalizeVariable, -classe))
diag(correlation_columnVariable) <- 0  # ignore trivial self-correlation
correlation_columnVariable <- which(abs(correlation_columnVariable) > 0.8, arr.ind = TRUE)
correlation_columnVariable <- unique(row.names(correlation_columnVariable))
# all_of() makes the external-vector selection explicit (silences the note).
corrplot(cor(select(differenceAnalizeVariable, all_of(correlation_columnVariable))),
         type = "upper", order = "hclust", method = "number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(correlation_columnVariable)` instead of `correlation_columnVariable` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Bin every feature into quartiles (correlationfunnel::binarize), then
# plot which binned features correlate most with classes A and B.
correlation_functionDifferenceVariable <- differenceAnalizeVariable %>% binarize(n_bins = 4, thresh_infreq = 0.01)
correlation_Variablea <- correlation_functionDifferenceVariable %>% correlate(target = classe__A)
correlation_Variablea %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
correlation_Variableb <- correlation_functionDifferenceVariable %>% correlate(target = classe__B)
correlation_Variableb %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Correlation funnel for class C.
# Correlation funnel against the class-C indicator.
correlation_Variablec <- correlation_functionDifferenceVariable %>% correlate(target = classe__C)
correlation_Variablec %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Correlation funnel for class D.
# Correlation funnel against the class-D indicator.
# (Fixes the "correaltion" typo in the variable name; it was only used in
# these two statements.)
correlation_Variabled <- correlation_functionDifferenceVariable %>% correlate(target = classe__D)
correlation_Variabled %>% plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
Correlation funnel for class E.
# Correlation funnel against the class-E indicator.
correlation_Variablee <- correlation_functionDifferenceVariable %>% correlate(target = classe__E)
correlation_Variablee %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
# Top-5 most discriminative features per class, read off the correlation
# funnels above.
columnVariablea <- c("magnet_arm_x", "pitch_forearm", "magnet_dumbbell_y",
                     "roll_forearm", "gyros_dumbbell_y")
columnVariableb <- c("magnet_dumbbell_y", "magnet_dumbbell_x", "roll_dumbbell",
                     "magnet_belt_y", "accel_dumbbell_x")
columnVariablec <- c("magnet_dumbbell_y", "roll_dumbbell", "accel_dumbbell_y",
                     "magnet_dumbbell_x", "magnet_dumbbell_z")
columnVariabled <- c("pitch_forearm", "magnet_arm_y", "magnet_forearm_x",
                     "accel_dumbbell_y", "accel_forearm_x")
columnVariablee <- c("magnet_belt_y", "magnet_belt_z", "roll_belt",
                     "gyros_belt_z", "magnet_dumbbell_y")
# unique() on the concatenation keeps first occurrences in order — the
# same result as the original element-by-element union() loop, without
# growing a vector inside a loop.
finalColumnVariablee <- unique(c(columnVariablea, columnVariableb,
                                 columnVariablec, columnVariabled,
                                 columnVariablee))
# Restrict the analysis frame to the selected features plus the target.
# all_of() makes the external-vector selection explicit (silences the note).
dataFrameAnalize2Variable <- differenceAnalizeVariable %>% select(all_of(finalColumnVariablee), classe)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(finalColumnVariablee)` instead of `finalColumnVariablee` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# How many of the selected features come from each sensor location.
count_sensorVariable <- function(pattern) sum(grepl(pattern, finalColumnVariablee))
data.frame("arm" = count_sensorVariable("_arm"),
           "forearm" = count_sensorVariable("_forearm"),
           "belt" = count_sensorVariable("_belt"),
           "dumbbell" = count_sensorVariable("_dumbbell"))
## arm forearm belt dumbbell
## 1 2 4 4 7
Helper functions for the density and scatter panels of the pairs plots.
# Diagonal panel for ggpairs: semi-transparent density curve per class.
my_densityVariable <- function(data, mapping, ...) {
  base_plot <- ggplot(data = data, mapping = mapping)
  base_plot + geom_density(..., alpha = 0.3) + scale_fill_brewer(palette = "Set2")
}
# Lower panel for ggpairs: heavily transparent points to reduce
# overplotting on thousands of rows.
my_pointVariable <- function(data, mapping, ...) {
  base_plot <- ggplot(data = data, mapping = mapping)
  base_plot + geom_point(..., alpha = 0.1) + scale_fill_brewer(palette = "Set2")
}
# Pairs plot of the first five selected features, coloured by class.
ggpairs(dataFrameAnalize2Variable, columns = 1:5,aes(color = classe),
lower = list(continuous = my_pointVariable),diag = list(continuous = my_densityVariable))
Pairs plots for the remaining selected features.
# Pairs plots of features 6-10 and 11-17, coloured by class.
ggpairs(dataFrameAnalize2Variable, columns = 6:10,aes(color = classe),
lower = list(continuous = my_pointVariable),diag = list(continuous = my_densityVariable))
ggpairs(dataFrameAnalize2Variable, columns = 11:17,aes(color = classe),
lower = list(continuous = my_pointVariable),diag = list(continuous = my_densityVariable))
# Build the final modelling matrices: the 17 selected features, coerced to
# numeric, then centred/scaled/Box-Cox-transformed.  The preProcess object
# is fitted on the training split only and applied to both splits.
dfTrainF <- dataFrameTrainLoad %>% select(all_of(finalColumnVariablee), classe)
dfValF <- valueOfDifferent %>% select(all_of(finalColumnVariablee), classe)
# lapply keeps the data.frame structure (sapply would produce a matrix).
dfTrainF[, 1:17] <- lapply(dfTrainF[, 1:17], as.numeric)
dfValF[, 1:17] <- lapply(dfValF[, 1:17], as.numeric)
# Renamed from `levels`, which shadowed base::levels().
classLevels <- c("A", "B", "C", "D", "E")
preprop_obj <- preProcess(select(dfTrainF, -classe),
                          method = c("center", "scale", "BoxCox"))
xTrain <- predict(preprop_obj, select(dfTrainF, -classe))
yTrain <- factor(dfTrainF$classe, levels = classLevels)
xVal <- predict(preprop_obj, select(dfValF, -classe))
yVal <- factor(dfValF$classe, levels = classLevels)
# 5-fold cross-validation settings shared by the caret models.
trControl <- trainControl(method="cv", number=5)
# Decision tree (CART / rpart).
modelCT <- train(x = xTrain,y = yTrain,
method = "rpart", trControl = trControl)
# Random forest, selecting on accuracy.
modelRF <- train(x = xTrain,y = yTrain,
method = "rf", trControl = trControl,verbose=FALSE, metric = "Accuracy")
# Stochastic gradient boosting.
modelGBM <- train(x = xTrain,y = yTrain,
method = "gbm",trControl=trControl, verbose=FALSE)
# Polynomial-kernel SVM (e1071) — fitted directly, not via caret, so it
# does not go through the cross-validation above.
modelSVM <- svm(x = xTrain,y = yTrain,
kernel = "polynomial", cost = 10)
# Decision-tree performance on the held-out validation set.
confusionMatrix(predict(modelCT,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1026 307 337 284 100
## B 17 248 22 120 91
## C 71 204 325 239 193
## D 0 0 0 0 0
## E 2 0 0 0 337
##
## Overall Statistics
##
## Accuracy : 0.4935
## 95% CI : (0.4777, 0.5093)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3377
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9194 0.32675 0.47515 0.0000 0.46741
## Specificity 0.6338 0.92099 0.78172 1.0000 0.99938
## Pos Pred Value 0.4995 0.49799 0.31492 NaN 0.99410
## Neg Pred Value 0.9518 0.85080 0.87582 0.8361 0.89286
## Prevalence 0.2845 0.19347 0.17436 0.1639 0.18379
## Detection Rate 0.2615 0.06322 0.08284 0.0000 0.08590
## Detection Prevalence 0.5236 0.12694 0.26306 0.0000 0.08641
## Balanced Accuracy 0.7766 0.62387 0.62843 0.5000 0.73339
# Random-forest performance on the held-out validation set.
confusionMatrix(predict(modelRF,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1113 5 0 0 0
## B 0 741 3 2 0
## C 3 12 679 10 0
## D 0 1 2 631 2
## E 0 0 0 0 719
##
## Overall Statistics
##
## Accuracy : 0.9898
## 95% CI : (0.9861, 0.9927)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9871
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9973 0.9763 0.9927 0.9813 0.9972
## Specificity 0.9982 0.9984 0.9923 0.9985 1.0000
## Pos Pred Value 0.9955 0.9933 0.9645 0.9921 1.0000
## Neg Pred Value 0.9989 0.9943 0.9984 0.9963 0.9994
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2837 0.1889 0.1731 0.1608 0.1833
## Detection Prevalence 0.2850 0.1902 0.1795 0.1621 0.1833
## Balanced Accuracy 0.9978 0.9874 0.9925 0.9899 0.9986
Plotting the random-forest error rate against the number of trees.
# OOB/class error of the final random forest as trees are added.
plot(modelRF$finalModel,main="Error VS no of tree")
Confusion matrix and overall statistics for the GBM model.
# Gradient-boosting performance on the held-out validation set.
confusionMatrix(predict(modelGBM,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1090 23 1 2 4
## B 9 643 35 19 12
## C 13 58 632 56 15
## D 4 35 16 562 14
## E 0 0 0 4 676
##
## Overall Statistics
##
## Accuracy : 0.9184
## 95% CI : (0.9094, 0.9268)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8968
##
## Mcnemar's Test P-Value : 2.912e-14
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9767 0.8472 0.9240 0.8740 0.9376
## Specificity 0.9893 0.9763 0.9562 0.9790 0.9988
## Pos Pred Value 0.9732 0.8955 0.8165 0.8906 0.9941
## Neg Pred Value 0.9907 0.9638 0.9835 0.9754 0.9861
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2778 0.1639 0.1611 0.1433 0.1723
## Detection Prevalence 0.2855 0.1830 0.1973 0.1608 0.1733
## Balanced Accuracy 0.9830 0.9117 0.9401 0.9265 0.9682
Prediction and confusion matrix for the SVM model.
# SVM performance on the held-out validation set.
confusionMatrix(predict(modelSVM,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1106 44 20 21 2
## B 1 668 9 2 3
## C 6 40 645 45 6
## D 3 6 10 575 16
## E 0 1 0 0 694
##
## Overall Statistics
##
## Accuracy : 0.9401
## 95% CI : (0.9322, 0.9473)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9241
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9910 0.8801 0.9430 0.8942 0.9626
## Specificity 0.9690 0.9953 0.9701 0.9893 0.9997
## Pos Pred Value 0.9271 0.9780 0.8693 0.9426 0.9986
## Neg Pred Value 0.9963 0.9719 0.9877 0.9795 0.9916
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2819 0.1703 0.1644 0.1466 0.1769
## Detection Prevalence 0.3041 0.1741 0.1891 0.1555 0.1772
## Balanced Accuracy 0.9800 0.9377 0.9565 0.9418 0.9811
Generating predictions for the 20 graded test cases with each model.
# Predict the 20 graded test cases with each trained model.
dfTest2 <- dataFrameTestLoad %>% select(all_of(finalColumnVariablee), problem_id)
# BUG FIX: the models were trained on features transformed by preprop_obj
# (center/scale/BoxCox), but the original code predicted on the RAW test
# features — which is consistent with the wildly disagreeing predictions
# in the printed output.  Apply the same preprocessing here.
xTest <- predict(preprop_obj, dfTest2 %>% select(all_of(finalColumnVariablee)))
resultvaiable <- data.frame("problem_id" = dataFrameTestLoad$problem_id,
                            "PREDICTION_RF" = predict(modelRF, xTest),
                            "PREDICTION_GBM" = predict(modelGBM, xTest),
                            "PREDICTION_SVM" = predict(modelSVM, xTest))
resultvaiable
## problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1 1 E E C
## 2 2 A E A
## 3 3 A E B
## 4 4 E E A
## 5 5 A E A
## 6 6 E D A
## 7 7 E E B
## 8 8 B D B
## 9 9 A D E
## 10 10 E E E
## 11 11 A E B
## 12 12 A D A
## 13 13 B E E
## 14 14 A D B
## 15 15 E E B
## 16 16 E E A
## 17 17 E E C
## 18 18 B E A
## 19 19 E E A
## 20 20 E E D